---
title: "torres"
format: html
editor: visual
---

## Figure 13 (Torres)

```{r}
# Requires one input file: torres-stm-and-annotations.csv

# libraries
library(dsl)
library(jtools)
library(tidyverse)
library(broom)
library(gridExtra)
library(devtools)
# Load the data. Columns used in this file: `first`, `value`, `clip`, `pol`.
torres_df <- read_csv("~/Downloads/DSL images/torres-stm-and-annotations.csv")

# Inspect the data and the correlation between `clip` and `first`
torres_df
cor(torres_df$clip, torres_df$first)

# No-intercept regressions of each measure on `pol`. Per the model names used
# in plot_summs() below: `first` = our annotations, `value` = Torres dense
# crowds, `clip` = CLIP.
truth_model <- lm(formula = first ~ pol - 1, data = torres_df)
torres_model <- lm(formula = value ~ pol - 1, data = torres_df)
clip_model <- lm(formula = clip ~ pol - 1, data = torres_df)

plot_summs(
  truth_model, torres_model, clip_model,
  model.names = c("our annotations", "torres dense crowds", "clip")
)

# Build the DSL input: same data, with `first` blanked out for a random
# subset of rows
dsl_torres <- torres_df
dsl_torres$row <- seq_len(nrow(dsl_torres))

# Discard any existing RNG state so the draw below is not tied to an earlier
# seed. Guarded because rm() warns when `.Random.seed` does not exist yet
# (no random numbers drawn so far in this session).
# NOTE(review): this makes the masked subset differ between runs -- confirm
# that is intended for this figure.
if (exists(".Random.seed", envir = globalenv(), inherits = FALSE)) {
  rm(.Random.seed, envir = globalenv())
}

# Rows whose `first` value is hidden from DSL.
# NOTE(review): 488 is hard-coded; the plot labels this run "DSL 200", so
# presumably 200 rows stay labeled -- confirm against nrow(torres_df).
sample25 <- sample(dsl_torres$row, size = 488)
dsl_torres$first[dsl_torres$row %in% sample25] <- NA
dsl_torres <- as.data.frame(dsl_torres)
dsl_torres
```

```{r}
# DSL fit on the partially masked data: `first` is the outcome (NA on the
# masked rows) and `value` is passed as its prediction
dsl_25_model <- dsl(
  formula = first ~ pol - 1,
  data = dsl_torres,
  predicted_var = "first",
  prediction = "value",
  model = "lm"
)
summary(dsl_25_model)
```

```{r}
# Put every model's coefficient table into one common layout, stack them,
# and build the comparison plot

# Display labels for the six coefficients, in model order
torres_pol_terms <- c("center", "left", "left-center", "not-rated", "right", "right-center")
# Columns kept from each coefficient table
torres_plot_cols <- c("term", "estimate", "std.error", "p.value", "conf.low", "conf.high")

# lm fits: broom coefficient tables with confidence intervals
clip_tidy <- tidy(clip_model, conf.int = TRUE)
truth_tidy <- tidy(truth_model, conf.int = TRUE)
torres_tidy <- tidy(torres_model, conf.int = TRUE)

# DSL fit: copy the summary columns to broom-style names, then keep only the
# columns that also exist in the broom output
dsl_torres_tidy <- as.data.frame(summary(dsl_25_model))
dsl_torres_tidy$term <- row.names(dsl_torres_tidy)
row.names(dsl_torres_tidy) <- 1:6

dsl_torres_tidy$estimate <- dsl_torres_tidy$Estimate
dsl_torres_tidy$std.error <- dsl_torres_tidy$`Std. Error`
dsl_torres_tidy$p.value <- dsl_torres_tidy$`p value`
dsl_torres_tidy$conf.low <- dsl_torres_tidy$`CI Lower`
dsl_torres_tidy$conf.high <- dsl_torres_tidy$`CI Upper`
dsl_torres_tidy <- dsl_torres_tidy[, names(dsl_torres_tidy) %in% names(clip_tidy)]

# Replace the raw coefficient names with the display labels
dsl_torres_tidy$term <- torres_pol_terms
clip_tidy$term <- torres_pol_terms
truth_tidy$term <- torres_pol_terms
torres_tidy$term <- torres_pol_terms

# Plain data frames restricted to the shared columns
machine <- as.data.frame(clip_tidy)[, torres_plot_cols]
human <- as.data.frame(truth_tidy)[, torres_plot_cols]
top <- as.data.frame(torres_tidy)[, torres_plot_cols]

# Stack in the order CLIP, ground truth, DSL, Torres (six rows each)
alltogether <- rbind(machine, human, dsl_torres_tidy, top)
alltogether$model <- rep(c("CLIP", "Ground truth", "DSL 200", "Torres"), each = 6)

# One facet per category ("not-rated" is left out), point estimate with a
# horizontal confidence interval per model
torres_1 <- ggplot(
  filter(alltogether, term != "not-rated"),
  aes(estimate, model, colour = model, shape = model)
) +
  facet_wrap(~ term) +
  scale_shape_manual(values = 0:9) +
  geom_point(show.legend = FALSE) +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high), show.legend = FALSE) +
  # dashed reference line at zero
  geom_vline(xintercept = 0, lty = 2) +
  coord_cartesian(xlim = c(-0.05, 0.7)) +
  labs(
    title = "Estimate of effect of political leaning on dense crowd proportion by model",
    y = NULL
  ) +
  theme_bw() +
  scale_color_grey()
```

## Running multiple times

```{r}
# Benchmarks from the model fitted on our annotations (truth_model /
# truth_tidy). Positions 1, 2, 3, 5 and 6 are used; position 4 is skipped.

# point estimates
t_center <- coef(truth_model)[1]
t_left <- coef(truth_model)[2]
t_leftcenter <- coef(truth_model)[3]
t_right <- coef(truth_model)[5]
t_rightcenter <- coef(truth_model)[6]

# standard errors
t_sd_center <- truth_tidy[["std.error"]][1]
t_sd_left <- truth_tidy[["std.error"]][2]
t_sd_leftcenter <- truth_tidy[["std.error"]][3]
t_sd_right <- truth_tidy[["std.error"]][5]
t_sd_rightcenter <- truth_tidy[["std.error"]][6]

# upper confidence bounds
t_up_center <- truth_tidy[["conf.high"]][1]
t_up_left <- truth_tidy[["conf.high"]][2]
t_up_leftcenter <- truth_tidy[["conf.high"]][3]
t_up_right <- truth_tidy[["conf.high"]][5]
t_up_rightcenter <- truth_tidy[["conf.high"]][6]

# lower confidence bounds
t_l_center <- truth_tidy[["conf.low"]][1]
t_l_left <- truth_tidy[["conf.low"]][2]
t_l_leftcenter <- truth_tidy[["conf.low"]][3]
t_l_right <- truth_tidy[["conf.low"]][5]
t_l_rightcenter <- truth_tidy[["conf.low"]][6]
```

```{r}
# Draw one bootstrap replicate of `torres_df` in which only N rows keep their
# `first` value.
#
# torres_df: data frame with a `first` column.
# N:         number of rows whose `first` value is kept.
# seed:      passed to set.seed() before any sampling.
#
# Returns the resampled data frame with an added `row` column (1..n) and
# `first` set to NA on all but N randomly chosen rows.
torres_sampling <- function(torres_df, N, seed) {
  set.seed(seed)
  n_rows <- nrow(torres_df)

  # Bootstrap: n_rows row indices drawn with replacement
  boot_idx <- sample(n_rows, size = n_rows, replace = TRUE)
  boot_df <- torres_df[boot_idx, ]

  # Rows that lose their label: everything except N rows
  hidden_idx <- sample(nrow(boot_df), size = (nrow(boot_df) - N))

  boot_df$row <- seq_len(nrow(boot_df))
  boot_df$first[hidden_idx] <- NA

  return(boot_df)
}
```

Now let's set a master seed and draw one seed for each of the `k` replications.

```{r}
# Number of replications per labeled-sample size
k <- 500

# Fix the master seed so the per-replication seeds are replicable, then draw
# k distinct seeds from 1..99999
set.seed(48347834)
seeds <- sample(seq_len(99999), size = k)
```

```{r}
# Draw one bootstrap replicate of `torres_df` in which only N rows keep their
# `first` value.
#
# NOTE: this re-defines the function from the earlier chunk; this later
# definition is the one in effect for the simulation loops below.
#
# torres_df: data frame with a `first` column.
# N:         number of rows whose `first` value is kept; must be a single
#            number between 0 and nrow(torres_df).
# seed:      passed to set.seed() before any sampling, so the same seed
#            always yields the same replicate.
#
# Returns the resampled data frame with an added `row` column (1..n) and
# `first` set to NA on all but N randomly chosen rows.
# Stops with an error when N is outside the valid range.
torres_sampling <- function(torres_df, N, seed) {
  n_rows <- nrow(torres_df)

  # Fail early with a clear message; otherwise an out-of-range N only shows
  # up as an opaque error raised from inside sample()
  if (!is.numeric(N) || length(N) != 1L || is.na(N) || N < 0 || N > n_rows) {
    stop(
      "`N` must be a single number between 0 and nrow(torres_df) (",
      n_rows, ").",
      call. = FALSE
    )
  }

  set.seed(seed)

  # Sample with replacement from the rows of the data frame
  sample_torres <- torres_df[sample(n_rows, size = n_rows, replace = TRUE), ]

  # Randomly select the rows that lose their label (all but N rows)
  sub_sample_indices <- sample(nrow(sample_torres), size = (nrow(sample_torres) - N))
  sub_sample_torres <- sample_torres
  sub_sample_torres$row <- seq_len(nrow(sub_sample_torres))

  # Hide the label on the selected rows
  sub_sample_torres$first[sub_sample_indices] <- NA

  sub_sample_torres
}

```

## Width plots

```{r}

library(estimatr)
library(dsl)
library(jtools)
library(tidyverse)
library(broom)
library(gridExtra)
library(devtools)



# ---- Simulation for N = 100, 150, 200, 250, 300 and 350 labeled rows ----
#
# For each labeled-sample size N the same procedure runs k times:
#   1. torres_sampling() draws a bootstrap replicate with N labeled rows,
#   2. dsl() is fitted on the replicate (prediction = `value`),
#   3. lm_robust() is fitted on the same data frame,
#   4. estimate / std.error / CI bounds / p-value of five coefficients are
#      stored in column i of the matching results table.
# Results are published under the original variable names
# (multiple_runs_<coef>_<N> for DSL, smultiple_runs_<coef>_<N> for lm_robust)
# because later chunks refer to them by name.

# Row layout shared by every results table (one column per replication)
width_stat_names <- c("estimate", "std.error", "conf.low", "conf.high", "p.value")

# Position of each stored coefficient in the fitted models; position 4
# (labeled "not-rated" in the figure code above) is not stored.
# NOTE(review): positional lookup assumes all six coefficients are estimated
# in every replication; a missing one yields NA in the last slot and shifts
# the others -- confirm this is acceptable.
width_coef_rows <- c(center = 1, left = 2, leftcenter = 3, right = 5, rightcenter = 6)

# Run the k replications for one labeled-sample size.
#
# N: number of labeled rows per replicate (passed to torres_sampling()).
#
# Returns list(dsl = <tables>, labeled = <tables>); each element is a named
# list with one 5 x k data frame per coefficient in `width_coef_rows`.
# Columns of replications that raised an error keep their initial NA values.
run_width_simulation <- function(N) {
  empty_tables <- function() {
    lapply(width_coef_rows, function(pos) {
      data.frame(matrix(nrow = 5, ncol = k), row.names = width_stat_names)
    })
  }
  dsl_runs <- empty_tables()
  labeled_runs <- empty_tables()
  n_failed <- 0

  for (i in seq_len(k)) {
    df_trial <- torres_sampling(torres_df, N = N, seed = seeds[i])

    # A failed fit must not abort the whole simulation, but it is counted
    # and reported below instead of being dropped silently
    fit_ok <- tryCatch({
      dsl_model_i <- dsl(
        formula = first ~ pol - 1, data = df_trial,
        predicted_var = "first", prediction = "value",
        model = "lm", tuning = TRUE
      )
      d <- data.frame(summary(dsl_model_i))
      for (nm in names(width_coef_rows)) {
        j <- width_coef_rows[[nm]]
        dsl_runs[[nm]][, i] <- c(
          d$Estimate[j], d$Std..Error[j], d$CI.Lower[j], d$CI.Upper[j], d$p.value[j]
        )
      }

      sub_model_i <- lm_robust(formula = first ~ pol - 1, data = df_trial, se_type = "HC0")
      d <- as.data.frame(tidy(sub_model_i, conf.int = TRUE))
      for (nm in names(width_coef_rows)) {
        j <- width_coef_rows[[nm]]
        labeled_runs[[nm]][, i] <- c(
          d$estimate[j], d$std.error[j], d$conf.low[j], d$conf.high[j], d$p.value[j]
        )
      }
      TRUE
    }, error = function(e) FALSE)

    if (!fit_ok) {
      n_failed <- n_failed + 1
    }
  }

  if (n_failed > 0) {
    message(
      "N = ", N, ": ", n_failed, " of ", k,
      " replications raised an error; their unfilled columns stay NA"
    )
  }

  list(dsl = dsl_runs, labeled = labeled_runs)
}

# Run every size and expose the tables under the names used downstream
for (n_labeled in c(100, 150, 200, 250, 300, 350)) {
  width_res <- run_width_simulation(n_labeled)
  for (nm in names(width_coef_rows)) {
    assign(paste0("multiple_runs_", nm, "_", n_labeled), width_res$dsl[[nm]])
    assign(paste0("smultiple_runs_", nm, "_", n_labeled), width_res$labeled[[nm]])
  }
}
```

```{r}
# ---- N = 400 labeled rows ----
# Each results table has one row per statistic and one column per
# replication; columns keep their initial NA when a replication fails.
blank_runs_400 <- data.frame(
  matrix(nrow = 5, ncol = k),
  row.names = c("estimate", "std.error", "conf.low", "conf.high", "p.value")
)

# DSL results, one table per stored coefficient
multiple_runs_left_400 <- blank_runs_400
multiple_runs_leftcenter_400 <- blank_runs_400
multiple_runs_center_400 <- blank_runs_400
multiple_runs_right_400 <- blank_runs_400
multiple_runs_rightcenter_400 <- blank_runs_400

# lm_robust results, one table per stored coefficient
smultiple_runs_left_400 <- blank_runs_400
smultiple_runs_leftcenter_400 <- blank_runs_400
smultiple_runs_center_400 <- blank_runs_400
smultiple_runs_right_400 <- blank_runs_400
smultiple_runs_rightcenter_400 <- blank_runs_400

# Number of replications in which a model fit raised an error
n_failed_400 <- 0

for (i in seq_len(k)) {
  df_trial <- torres_sampling(torres_df, N = 400, seed = seeds[i])

  # A failed fit must not abort the loop, but it is counted and reported
  # below instead of being dropped silently
  fit_ok <- tryCatch({
    dsl_model_i <- dsl(
      formula = first ~ pol - 1, data = df_trial,
      predicted_var = "first", prediction = "value",
      model = "lm", tuning = TRUE
    )
    d <- data.frame(summary(dsl_model_i))
    # Coefficient positions 1, 2, 3, 5, 6; position 4 is not stored
    multiple_runs_center_400[, i] <- c(d$Estimate[1], d$Std..Error[1], d$CI.Lower[1], d$CI.Upper[1], d$p.value[1])
    multiple_runs_left_400[, i] <- c(d$Estimate[2], d$Std..Error[2], d$CI.Lower[2], d$CI.Upper[2], d$p.value[2])
    multiple_runs_leftcenter_400[, i] <- c(d$Estimate[3], d$Std..Error[3], d$CI.Lower[3], d$CI.Upper[3], d$p.value[3])
    multiple_runs_right_400[, i] <- c(d$Estimate[5], d$Std..Error[5], d$CI.Lower[5], d$CI.Upper[5], d$p.value[5])
    multiple_runs_rightcenter_400[, i] <- c(d$Estimate[6], d$Std..Error[6], d$CI.Lower[6], d$CI.Upper[6], d$p.value[6])

    sub_model_i <- lm_robust(formula = first ~ pol - 1, data = df_trial, se_type = "HC0")
    d <- as.data.frame(tidy(sub_model_i, conf.int = TRUE))
    smultiple_runs_center_400[, i] <- c(d$estimate[1], d$std.error[1], d$conf.low[1], d$conf.high[1], d$p.value[1])
    smultiple_runs_left_400[, i] <- c(d$estimate[2], d$std.error[2], d$conf.low[2], d$conf.high[2], d$p.value[2])
    smultiple_runs_leftcenter_400[, i] <- c(d$estimate[3], d$std.error[3], d$conf.low[3], d$conf.high[3], d$p.value[3])
    smultiple_runs_right_400[, i] <- c(d$estimate[5], d$std.error[5], d$conf.low[5], d$conf.high[5], d$p.value[5])
    smultiple_runs_rightcenter_400[, i] <- c(d$estimate[6], d$std.error[6], d$conf.low[6], d$conf.high[6], d$p.value[6])
    TRUE
  }, error = function(e) FALSE)

  if (!fit_ok) {
    n_failed_400 <- n_failed_400 + 1
  }
  dsl_model_i <- NA
}

if (n_failed_400 > 0) {
  message(
    "N = 400: ", n_failed_400, " of ", k,
    " replications raised an error; their unfilled columns stay NA"
  )
}

```

```{r}
# Transpose every results table so that each replication becomes a row and
# each statistic a column:
#   multiple_runs_<coef>_<N>  -> cdsl_<coef>_<N>
#   smultiple_runs_<coef>_<N> -> sdsl_<coef>_<N>
# The tables are looked up and created by name because the rest of the file
# refers to them as individual variables.
for (n_lab in c(100, 150, 200, 250, 300, 350, 400)) {
  for (coef_grp in c("center", "left", "leftcenter", "right", "rightcenter")) {
    tbl_suffix <- paste0(coef_grp, "_", n_lab)
    assign(
      paste0("cdsl_", tbl_suffix),
      as.data.frame(t(get(paste0("multiple_runs_", tbl_suffix))))
    )
    assign(
      paste0("sdsl_", tbl_suffix),
      as.data.frame(t(get(paste0("smultiple_runs_", tbl_suffix))))
    )
  }
}

# Drop every run (row) that contains an NA from each cdsl_*/sdsl_* table.
# The original note attributes the NAs to the 100, 150 and 200 annotation
# sizes (insufficient number of subcategories); the filter is nevertheless
# applied to all seven sizes, 100 through 400.
run_tables <- with(
  expand.grid(
    group = c("center", "left", "leftcenter", "right", "rightcenter"),
    size = c("100", "150", "200", "250", "300", "350", "400"),
    prefix = c("cdsl", "sdsl"),
    stringsAsFactors = FALSE
  ),
  paste(prefix, group, size, sep = "_")
)

for (run_table in run_tables) {
  assign(run_table, na.omit(get(run_table)))
}

# Remove the loop helpers so only the filtered tables are left behind.
rm(run_tables, run_table)


# Seven annotation sizes (100-400 in steps of 50) are compared with the
# ground-truth ("oracle") interval for each of the five groups.
ci_groups <- c("center", "left", "leftcenter", "right", "rightcenter")
ci_sizes <- c("100", "150", "200", "250", "300", "350", "400")

# cdsl_* tables ordered size-major, group-minor: the same order as the rows
# of `confinter` that follow the five ground-truth rows.
cdsl_runs <- mget(
  paste(
    "cdsl",
    rep(ci_groups, times = length(ci_sizes)),
    rep(ci_sizes, each = length(ci_groups)),
    sep = "_"
  ),
  inherits = TRUE
)

# One row per (type, group): 5 ground-truth rows, then 5 rows per size.
confinter <- data.frame(names = rep(ci_groups, times = length(ci_sizes) + 1))

# Upper / lower bounds: ground-truth values first, then the mean bound over
# the retained runs of each table.
confinter$up <- c(
  t_up_center, t_up_left, t_up_leftcenter, t_up_right, t_up_rightcenter,
  vapply(
    cdsl_runs,
    function(run_tbl) mean(run_tbl$conf.high),
    numeric(1),
    USE.NAMES = FALSE
  )
)
confinter$low <- c(
  t_l_center, t_l_left, t_l_leftcenter, t_l_right, t_l_rightcenter,
  vapply(
    cdsl_runs,
    function(run_tbl) mean(run_tbl$conf.low),
    numeric(1),
    USE.NAMES = FALSE
  )
)

confinter$type <- rep(c("groundtruth", ci_sizes), each = length(ci_groups))
confinter$width <- abs(confinter$low - confinter$up)

oracleci <- confinter[confinter$type == "groundtruth", ]

# dsl<size>ci: the rows of one size plus `div`, the CI width divided by the
# oracle CI width of the same group.
for (ci_size in ci_sizes) {
  size_rows <- confinter[confinter$type == ci_size, ]
  size_rows$div <- size_rows$width / oracleci$width
  assign(paste0("dsl", ci_size, "ci"), size_rows)
}

plotci <- rbind(
  dsl100ci, dsl150ci, dsl200ci, dsl250ci, dsl300ci, dsl350ci, dsl400ci
)
# Numeric copy of the size, used as the x axis of the plots.
plotci$annotations <- rep(as.numeric(ci_sizes), each = length(ci_groups))

# Remove the helpers so only the tables defined above are left behind.
rm(ci_groups, ci_sizes, cdsl_runs, ci_size, size_rows)
# Quick look at the ratio curves, printed inline with the default theme.
ggplot(
  plotci,
  aes(y = div, x = annotations, color = names, group = names, shape = names)
) +
  geom_point() +
  geom_line() +
  labs(
    title = "Width of DSL CI/Width of Oracle CI",
    y = "Width of DSL CI/Width of Oracle CI",
    x = "Number of annotations"
  ) +
  theme(plot.title = element_text(face = "bold"))

# Stored version of the same plot with a dashed reference line at a ratio of
# 1 (DSL CI as wide as the oracle CI).
# theme_bw() is a complete theme: adding it replaces every theme setting made
# before it. It therefore has to come first, otherwise the bold title and the
# hidden legend requested below are silently discarded.
torresconf <- ggplot(
  plotci,
  aes(y = div, x = annotations, color = names, group = names, shape = names)
) +
  geom_point() +
  geom_line() +
  labs(
    title = "Ratio between DSL CI and Oracle CI by number of annotations",
    y = "Width of DSL CI/Width of Oracle CI",
    x = "Number of annotations"
  ) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "black") +
  theme_bw() +
  theme(
    plot.title = element_text(face = "bold"),
    legend.position = "none"
  )

#### subsample
# Same table as `confinter`, built from the sub-sample (sdsl_*) runs.
ci_groups <- c("center", "left", "leftcenter", "right", "rightcenter")
ci_sizes <- c("100", "150", "200", "250", "300", "350", "400")

# sdsl_* tables ordered size-major, group-minor: the same order as the rows
# of `sconfinter` that follow the five ground-truth rows.
sdsl_runs <- mget(
  paste(
    "sdsl",
    rep(ci_groups, times = length(ci_sizes)),
    rep(ci_sizes, each = length(ci_groups)),
    sep = "_"
  ),
  inherits = TRUE
)

# One row per (type, group): 5 ground-truth rows, then 5 rows per size.
sconfinter <- data.frame(names = rep(ci_groups, times = length(ci_sizes) + 1))

# Upper / lower bounds: ground-truth values first, then the mean bound over
# the retained runs of each table.
sconfinter$up <- c(
  t_up_center, t_up_left, t_up_leftcenter, t_up_right, t_up_rightcenter,
  vapply(
    sdsl_runs,
    function(run_tbl) mean(run_tbl$conf.high),
    numeric(1),
    USE.NAMES = FALSE
  )
)
sconfinter$low <- c(
  t_l_center, t_l_left, t_l_leftcenter, t_l_right, t_l_rightcenter,
  vapply(
    sdsl_runs,
    function(run_tbl) mean(run_tbl$conf.low),
    numeric(1),
    USE.NAMES = FALSE
  )
)

sconfinter$type <- rep(c("groundtruth", ci_sizes), each = length(ci_groups))
sconfinter$width <- abs(sconfinter$low - sconfinter$up)

# Re-derived from `sconfinter`; not read again inside this block.
oracleci <- sconfinter[sconfinter$type == "groundtruth", ]

# sdsl<size>ci: the rows of one size plus `div`, the DSL CI width (from the
# matching dsl<size>ci table) divided by the sub-sample CI width.
for (ci_size in ci_sizes) {
  size_rows <- sconfinter[sconfinter$type == ci_size, ]
  size_rows$div <- get(paste0("dsl", ci_size, "ci"))$width / size_rows$width
  assign(paste0("sdsl", ci_size, "ci"), size_rows)
}

splotci <- rbind(
  sdsl100ci, sdsl150ci, sdsl200ci, sdsl250ci, sdsl300ci, sdsl350ci, sdsl400ci
)
# Numeric copy of the size, used as the x axis of the plots.
splotci$annotations <- rep(as.numeric(ci_sizes), each = length(ci_groups))

# Remove the helpers so only the tables defined above are left behind.
rm(ci_groups, ci_sizes, sdsl_runs, ci_size, size_rows)

```

```{r}
# Panel 1: DSL CI width relative to the oracle CI, one line per group.
# The dashed line marks a ratio of 1; the legend is hidden in this panel.
p1 <- ggplot(
  plotci,
  aes(y = div, x = annotations, color = names, group = names, shape = names)
) +
  geom_point() +
  geom_line() +
  geom_hline(yintercept = 1, linetype = "dashed", color = "black") +
  labs(
    title = "Ratio between DSL CI and Oracle CI by number of annotations",
    x = "Number of annotations",
    y = "Width of DSL/Oracle CI"
  ) +
  theme_bw() +
  theme(
    legend.position = "none",               # no legend on this panel
    plot.title = element_text(size = 14),   # title font size
    axis.title = element_text(size = 13),   # axis title font size
    axis.text = element_text(size = 13),    # axis tick label font size
    strip.text = element_text(size = 13),   # facet label font size
    legend.text = element_text(size = 13),  # legend text font size
    legend.title = element_text(size = 13)  # legend title font size
  )


# Panel 2: DSL CI width relative to the sub-sample CI, one line per group.
# The dashed line marks a ratio of 1; this panel carries the legend, placed
# below the plot.
# The former `theme(plot.title = element_text(face = "bold"))` call was
# dropped: it preceded theme_bw(), a complete theme that replaces all earlier
# theme settings, so it never had any effect on the plot.
p2 <- ggplot(
  splotci,
  aes(y = div, x = annotations, color = names, group = names, shape = names)
) +
  geom_point() +
  geom_line() +
  labs(
    title = "Ratio between DSL CI and sub-sample CI by number of annotations",
    y = "Width of DSL/sub-sample CI",
    x = "Number of annotations"
  ) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "black") +
  theme_bw() +
  theme(
    plot.title = element_text(size = 14),    # title font size
    axis.title = element_text(size = 13),    # axis title font size
    axis.text = element_text(size = 13),     # axis tick label font size
    strip.text = element_text(size = 13),    # facet label font size
    legend.text = element_text(size = 13),   # legend text font size
    legend.title = element_text(size = 13),  # legend title font size
    legend.position = "bottom"               # legend below the panel
  )
```
